import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Read the house data CSV from the working directory into a DataFrame and
# preview its first five rows (the default for head()).
HouseDF = pd.read_csv(filepath_or_buffer="kc_house_data.csv")
HouseDF.head(n=5)
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
5 rows × 21 columns
# Print each column's dtype and non-null count, plus the frame's memory usage.
HouseDF.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21613 entries, 0 to 21612 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 21613 non-null int64 1 date 21613 non-null object 2 price 21613 non-null float64 3 bedrooms 21613 non-null int64 4 bathrooms 21613 non-null float64 5 sqft_living 21613 non-null int64 6 sqft_lot 21613 non-null int64 7 floors 21613 non-null float64 8 waterfront 21613 non-null int64 9 view 21613 non-null int64 10 condition 21613 non-null int64 11 grade 21613 non-null int64 12 sqft_above 21613 non-null int64 13 sqft_basement 21613 non-null int64 14 yr_built 21613 non-null int64 15 yr_renovated 21613 non-null int64 16 zipcode 21613 non-null int64 17 lat 21613 non-null float64 18 long 21613 non-null float64 19 sqft_living15 21613 non-null int64 20 sqft_lot15 21613 non-null int64 dtypes: float64(5), int64(15), object(1) memory usage: 3.5+ MB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
HouseDF.describe()
| id | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.161300e+04 | 2.161300e+04 | 21613.000000 | 21613.000000 | 21613.000000 | 2.161300e+04 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 | 21613.000000 |
| mean | 4.580302e+09 | 5.400881e+05 | 3.370842 | 2.114757 | 2079.899736 | 1.510697e+04 | 1.494309 | 0.007542 | 0.234303 | 3.409430 | 7.656873 | 1788.390691 | 291.509045 | 1971.005136 | 84.402258 | 98077.939805 | 47.560053 | -122.213896 | 1986.552492 | 12768.455652 |
| std | 2.876566e+09 | 3.671272e+05 | 0.930062 | 0.770163 | 918.440897 | 4.142051e+04 | 0.539989 | 0.086517 | 0.766318 | 0.650743 | 1.175459 | 828.090978 | 442.575043 | 29.373411 | 401.679240 | 53.505026 | 0.138564 | 0.140828 | 685.391304 | 27304.179631 |
| min | 1.000102e+06 | 7.500000e+04 | 0.000000 | 0.000000 | 290.000000 | 5.200000e+02 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 290.000000 | 0.000000 | 1900.000000 | 0.000000 | 98001.000000 | 47.155900 | -122.519000 | 399.000000 | 651.000000 |
| 25% | 2.123049e+09 | 3.219500e+05 | 3.000000 | 1.750000 | 1427.000000 | 5.040000e+03 | 1.000000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1190.000000 | 0.000000 | 1951.000000 | 0.000000 | 98033.000000 | 47.471000 | -122.328000 | 1490.000000 | 5100.000000 |
| 50% | 3.904930e+09 | 4.500000e+05 | 3.000000 | 2.250000 | 1910.000000 | 7.618000e+03 | 1.500000 | 0.000000 | 0.000000 | 3.000000 | 7.000000 | 1560.000000 | 0.000000 | 1975.000000 | 0.000000 | 98065.000000 | 47.571800 | -122.230000 | 1840.000000 | 7620.000000 |
| 75% | 7.308900e+09 | 6.450000e+05 | 4.000000 | 2.500000 | 2550.000000 | 1.068800e+04 | 2.000000 | 0.000000 | 0.000000 | 4.000000 | 8.000000 | 2210.000000 | 560.000000 | 1997.000000 | 0.000000 | 98118.000000 | 47.678000 | -122.125000 | 2360.000000 | 10083.000000 |
| max | 9.900000e+09 | 7.700000e+06 | 33.000000 | 8.000000 | 13540.000000 | 1.651359e+06 | 3.500000 | 1.000000 | 4.000000 | 5.000000 | 13.000000 | 9410.000000 | 4820.000000 | 2015.000000 | 2015.000000 | 98199.000000 | 47.777600 | -121.315000 | 6210.000000 | 871200.000000 |
# `columns` is an attribute holding the column Index, not a method; calling it
# raises "TypeError: 'Index' object is not callable".
HouseDF.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long', 'sqft_living15', 'sqft_lot15'],
dtype='object')
# Draw seaborn's pair plot over the whole frame.
# NOTE(review): the frame has 21,613 rows and about 20 numeric columns, so this
# is likely slow to render; consider passing a subset of columns via `vars=`.
sns.pairplot(HouseDF)
<seaborn.axisgrid.PairGrid at 0x24458cb9ca0>
# Annotated correlation heatmap of the numeric columns.
# The frame contains the object-dtype `date` column; pandas >= 2.0 no longer
# drops non-numeric columns silently in corr() and raises instead, so restrict
# the frame to numeric dtypes first. Older pandas gives the same matrix.
numeric_corr = HouseDF.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True)
<AxesSubplot:>
from sklearn.model_selection import train_test_split

# Predictor columns: everything except `id`, `date` and the target `price`.
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]
X = HouseDF[feature_columns]
Y = HouseDF['price']

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.40, random_state=101
)
X_train
| bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2435 | 3 | 2.25 | 2550 | 9674 | 1.0 | 0 | 0 | 3 | 7 | 1850 | 700 | 1959 | 0 | 98178 | 47.4856 | -122.247 | 2240 | 9674 |
| 256 | 4 | 1.75 | 2360 | 7620 | 1.0 | 0 | 0 | 4 | 7 | 1180 | 1180 | 1955 | 0 | 98106 | 47.5278 | -122.345 | 1910 | 7620 |
| 13911 | 3 | 1.75 | 1770 | 2800 | 1.5 | 0 | 0 | 3 | 7 | 1770 | 0 | 1914 | 0 | 98103 | 47.6631 | -122.357 | 1630 | 3254 |
| 1135 | 7 | 3.00 | 2940 | 8624 | 1.0 | 0 | 0 | 3 | 8 | 1690 | 1250 | 1977 | 0 | 98155 | 47.7555 | -122.307 | 1850 | 8031 |
| 12181 | 4 | 2.50 | 2210 | 7079 | 2.0 | 0 | 0 | 3 | 8 | 2210 | 0 | 1993 | 0 | 98031 | 47.4206 | -122.183 | 1970 | 7000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5695 | 3 | 2.25 | 1920 | 9672 | 2.0 | 0 | 0 | 4 | 8 | 1920 | 0 | 1984 | 0 | 98074 | 47.6233 | -122.046 | 1950 | 10125 |
| 8006 | 3 | 1.00 | 1240 | 3600 | 1.5 | 0 | 0 | 3 | 7 | 1240 | 0 | 1902 | 0 | 98144 | 47.5986 | -122.298 | 1680 | 3600 |
| 17745 | 3 | 2.25 | 1780 | 7332 | 2.0 | 0 | 0 | 3 | 7 | 1780 | 0 | 1987 | 0 | 98038 | 47.3593 | -122.051 | 1510 | 7625 |
| 17931 | 2 | 1.00 | 1150 | 5000 | 1.0 | 0 | 0 | 4 | 7 | 1050 | 100 | 1924 | 0 | 98115 | 47.6846 | -122.317 | 1463 | 4320 |
| 13151 | 3 | 1.00 | 1450 | 7930 | 1.0 | 0 | 0 | 4 | 6 | 1150 | 300 | 1923 | 0 | 98126 | 47.5212 | -122.371 | 1040 | 7740 |
12967 rows × 18 columns
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline, fitted on the training split.
lm = LinearRegression()
lm.fit(X=X_train, y=Y_train)
LinearRegression()
# One fitted coefficient per feature, labelled by column name, as a
# single-column table.
coeff_df = pd.Series(lm.coef_, index=X.columns, name='Coefficient').to_frame()
coeff_df
| Coefficient | |
|---|---|
| bedrooms | -36819.464411 |
| bathrooms | 35835.150371 |
| sqft_living | 112.835604 |
| sqft_lot | 0.162778 |
| floors | 9277.688561 |
| waterfront | 562834.274157 |
| view | 52300.123926 |
| condition | 27570.424376 |
| grade | 97072.844829 |
| sqft_above | 73.458807 |
| sqft_basement | 39.376796 |
| yr_built | -2636.599217 |
| yr_renovated | 18.202834 |
| zipcode | -595.883695 |
| lat | 608465.676716 |
| long | -221497.479203 |
| sqft_living15 | 18.071370 |
| sqft_lot15 | -0.379984 |
# Predict prices for the held-out rows with the linear model and plot them
# against the true prices. (A stray `lz` token after the predict() call made
# the original line a SyntaxError.)
prediction = lm.predict(X_test)
plt.scatter(Y_test, prediction)
<matplotlib.collections.PathCollection at 0x2447ecb1a00>
# Histogram of the residuals (true minus predicted price) on the test split,
# followed by the linear model's R^2 on the same split.
residuals = Y_test - prediction
sns.histplot(residuals, bins=50);
lm.score(X=X_test, y=Y_test)
0.7058600293775569
from sklearn import ensemble

# Gradient-boosted regression trees with 400 estimators.
# * `loss` is left at its default (squared error): the old spelling 'ls' was
#   deprecated in scikit-learn 1.0 and removed in 1.2, while the new spelling
#   'squared_error' is rejected by releases before 1.0. The default is the
#   same loss on every version.
# * `random_state` is pinned so repeated fits give the same score; without it,
#   comparisons between runs with different n_estimators include seed noise.
clf = ensemble.GradientBoostingRegressor(
    n_estimators=400,
    max_depth=5,
    min_samples_split=2,
    learning_rate=0.1,
    random_state=101,
)
clf.fit(X_train, Y_train)
GradientBoostingRegressor(max_depth=5, n_estimators=400)
# R^2 of the boosted model on the held-out split.
clf.score(X=X_test, y=Y_test)
0.8904225008916121
# Per-row residuals of the linear model, drawn as unconnected markers against
# the original row index.
prediction = lm.predict(X=X_test)
residuals = Y_test - prediction
g = plt.plot(residuals, linestyle='', marker='o')
# Refit the linear model without an intercept term (this rebinds `lm`), then
# plot its test-split residuals and report its R^2.
lm = LinearRegression(fit_intercept=False)
lm.fit(X=X_train, y=Y_train)

prediction = lm.predict(X=X_test)
residuals = Y_test - prediction
g = plt.plot(residuals, linestyle='', marker='o')

lm.score(X=X_test, y=Y_test)
0.7057973136589795
# Per-row residuals of the boosted model on the test split, drawn as
# unconnected markers.
prediction = clf.predict(X=X_test)
residuals = Y_test - prediction
g = plt.plot(residuals, linestyle='', marker='o')
# Same boosted model with 401 estimators, for comparison.
# * `loss` is left at its default (squared error): the old spelling 'ls' was
#   removed in scikit-learn 1.2, and the default is the same loss on every
#   version.
# * `random_state` is pinned so the score reflects the change in n_estimators
#   rather than seed-to-seed variation.
clf = ensemble.GradientBoostingRegressor(
    n_estimators=401,
    max_depth=5,
    min_samples_split=2,
    learning_rate=0.1,
    random_state=101,
)
clf.fit(X_train, Y_train)
GradientBoostingRegressor(max_depth=5, n_estimators=401)
# R^2 of the refitted boosted model on the held-out split.
clf.score(X=X_test, y=Y_test)
0.886942189822708
# Boosted-model predictions for the test split, plotted against the true prices.
prediction = clf.predict(X=X_test)
plt.scatter(x=Y_test, y=prediction)
<matplotlib.collections.PathCollection at 0x2440194fac0>
# Residual distribution as a density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# `kde=True, stat="density"` is its axes-level replacement.
sns.histplot(Y_test - prediction, bins=50, kde=True, stat="density");
C:\Users\shyam\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
# The same residual histogram drawn twice: once with the axes-level histplot
# and once with the figure-level displot.
residuals = Y_test - prediction
sns.histplot(residuals, bins=50);
sns.displot(residuals, bins=50);
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Lasso regression on standardized features.
# Fitting Lasso() directly on the raw columns (square feet, years, lat/long,
# all on very different scales) stops at the default iteration cap with a
# ConvergenceWarning. Standardizing first is the usual remedy for coordinate
# descent, and max_iter is raised as extra headroom.
# `ls` is now a Pipeline; it still supports fit / predict / score. The L1
# penalty now applies to coefficients of the standardized features.
ls = make_pipeline(StandardScaler(), Lasso(max_iter=10000))
ls.fit(X_train, Y_train)
C:\Users\shyam\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:530: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 243429488812860.1, tolerance: 175091918878.2423 model = cd_fast.enet_coordinate_descent(
Lasso()
# R^2 of the Lasso model on the held-out split.
ls.score(X=X_test, y=Y_test)
0.7058597556855026
from sklearn import metrics

# Recompute the Lasso model's R^2 from its test-split predictions with
# sklearn.metrics and print it.
prediction = ls.predict(X=X_test)
score = metrics.r2_score(y_true=Y_test, y_pred=prediction)
print("R Squared Error:", score)
R Squared Error: 0.7058597556855026
# Residuals of the most recent predictions, one unconnected marker per test row.
g = plt.plot(Y_test - prediction, linestyle='', marker='o')
from sklearn import ensemble

# Boosted model with 390 estimators, fitted and scored (R^2) on the held-out
# split.
# * `loss` is left at its default (squared error): the old spelling 'ls' was
#   removed in scikit-learn 1.2, and the default is the same loss on every
#   version.
# * `random_state` is pinned so the score is reproducible and comparable
#   across different n_estimators settings.
clf = ensemble.GradientBoostingRegressor(
    n_estimators=390,
    max_depth=5,
    min_samples_split=2,
    learning_rate=0.1,
    random_state=101,
)
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)
0.8907139918590289
# Recompute the boosted model's R^2 from its test-split predictions and print it.
prediction = clf.predict(X=X_test)
score = metrics.r2_score(y_true=Y_test, y_pred=prediction)
print("R Squared Error:", score)
R Squared Error: 0.8907139918590289
# Residuals of the boosted model, one unconnected marker per test row.
g = plt.plot(Y_test - prediction, linestyle='', marker='o')